{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "朴素贝叶斯的原理：TODO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import bayes\n",
    "import utils"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 一个简单的人造数据 --- 预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "listOPosts,listClasses = utils.loadDataSet()\n",
    "myVocabList = utils.createVocabList(listOPosts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Expect Result:  \n",
    "myVocabList中包含了listOPosts中用到的所有单词。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "words =  ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']\n",
      "vec =  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0]\n"
     ]
    }
   ],
   "source": [
    "words = listOPosts[0]\n",
    "vec = utils.setOfWords2Vec(myVocabList, words)\n",
    "print (\"words = \", words)\n",
    "print (\"vec = \", vec)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Expect Result:  \n",
    "\n",
    "words =  ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']\n",
    "vec =  [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]  \n",
    "\n",
    "vec的长度即单词表中单词的个数。  \n",
    "单词表中第i个单词在words中存在，则把vec[i]置为1，否则为0。  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 一个简单的人造数据 --- 训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "listOPosts,listClasses = utils.loadDataSet()\n",
    "myVocabList = utils.createVocabList(listOPosts)\n",
    "trainMat = utils.Posts2Mat(myVocabList, listOPosts)\n",
    "p0V,p1V,pAb=bayes.trainNB0(trainMat,listClasses)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pAb =  0.5\n",
      "p0V =  [0.04166667 0.04166667 0.04166667 0.04166667 0.04166667 0.04166667\n",
      " 0.04166667 0.04166667 0.         0.04166667 0.04166667 0.\n",
      " 0.         0.         0.04166667 0.         0.         0.04166667\n",
      " 0.04166667 0.04166667 0.04166667 0.04166667 0.         0.\n",
      " 0.125      0.08333333 0.04166667 0.         0.04166667 0.\n",
      " 0.04166667 0.        ]\n",
      "p1V =  [0.         0.         0.         0.         0.         0.\n",
      " 0.         0.         0.05263158 0.         0.05263158 0.05263158\n",
      " 0.15789474 0.05263158 0.         0.05263158 0.05263158 0.\n",
      " 0.05263158 0.         0.         0.         0.05263158 0.05263158\n",
      " 0.         0.05263158 0.         0.10526316 0.10526316 0.05263158\n",
      " 0.         0.05263158]\n"
     ]
    }
   ],
   "source": [
    "print (\"pAb = \", pAb)\n",
    "print (\"p0V = \", p0V)\n",
    "print (\"p1V = \", p1V)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 一个简单的人造数据 --- 测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "classified as:  0\n"
     ]
    }
   ],
   "source": [
    "testEntry = ['love', 'my', 'dalmation']\n",
    "thisDoc = np.array(utils.setOfWords2Vec(myVocabList, testEntry))\n",
    "print ('classified as: ',bayes.classifyNBForOneData(thisDoc,p0V,p1V,pAb))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Expect Result:  \n",
    "classified as:  0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "classified as:  1\n"
     ]
    }
   ],
   "source": [
    "testEntry = ['stupid', 'garbage']\n",
    "thisDoc = np.array(utils.setOfWords2Vec(myVocabList, testEntry))\n",
    "print ('classified as: ',bayes.classifyNBForOneData(thisDoc,p0V,p1V,pAb))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Expect Result:  \n",
    "classified as:  1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Spam Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']\n"
     ]
    }
   ],
   "source": [
    "mySent='This book is the best book on Python or M.L. I have ever laid eyes upon.'\n",
    "testEntry = utils.preprocess(mySent)\n",
    "print (testEntry)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Expect Result:  \n",
    "\n",
    "['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']  \n",
    "\n",
    "提取出mySent中的单词，并全部转成小写。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataSet, labels = utils.loadSpamData()\n",
    "myVocabList = utils.createVocabList(dataSet)\n",
    "dataMat = utils.Posts2Mat(myVocabList, dataSet)\n",
    "train_X, train_y, test_X, test_y = utils.splitTrainAndTest(dataMat, labels)\n",
    "p0V,p1V,pAb=bayes.trainNB0(train_X,train_y)\n",
    "predict_y = bayes.classifyNB(test_X, p0V,p1V,pAb)\n",
    "utils.calculateAccuray(predict_y, test_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
